In [ ]:
import csv
import nltk
import math
import collections
from textblob import TextBlob
from pprint import pprint
In [ ]:
csvfile = open('comments.csv', 'rb')  # placeholder filename: point this at the comment CSV
reader = csv.reader(csvfile)
data = []
for line in reader:
    line[3] = line[3].decode('utf-8')
    data.append(line)
In [ ]:
# getting the number of rows
len(data)
In [ ]:
# taking a look at the first row
data[0]
In [ ]:
comment_text = data[1][-1]  # the text of one comment (row chosen arbitrarily)
In [ ]:
comment_text
In [ ]:
# strings are like lists of characters
comment_text[0]
In [ ]:
# use a colon for start:end indexes
comment_text[0:100]
In [ ]:
# they can be stuck together easily
'Some commenter wrote: ' + comment_text
In [ ]:
# and split apart
comment_text.split(' ')
In [ ]:
split_on_questions = comment_text.split('?')
In [ ]:
# it's easy to strip whitespace off of them
for string in split_on_questions:
    print string.strip()
In [ ]:
# and cast them to one case
cleaned = [string.strip().lower() for string in split_on_questions]
cleaned
In [ ]:
# join them back together
'? '.join(cleaned)
In [ ]:
# and look for substrings inside them
'clinton' in comment_text.lower()
CHALLENGE: count the number of times the words "Hilary" or "Clinton" appear in the dataset
In [ ]:
clinton_mentions = 0
for row in data:
    comment_text = row[-1]
    clinton_mentions += comment_text.count('Hilary') + comment_text.count('Clinton')
In [ ]:
blob = TextBlob(data[80][-1])
blob
In [ ]:
# we can get lists of sentences
blob.sentences
In [ ]:
# lists of words
blob.words
In [ ]:
# lists of "tokens" (punctuation included)
blob.tokens
In [ ]:
# even parts of speech and noun phrases
blob.tags
In [ ]:
blob.noun_phrases
In [ ]:
word_count = collections.Counter(blob.words)
In [ ]:
word_count
The most common words are almost all generic ones like "the" and "of" that tell us nothing about this particular comment. Potential approaches: filter out a standard list of "stopwords", and skip very short words.
In [ ]:
In [ ]:
stopwords = nltk.corpus.stopwords.words('english')
In [ ]:
nltk.download('stopwords')  # only needed if the stopwords corpus isn't installed yet
In [ ]:
for key in word_count.keys():
    if key in stopwords: del word_count[key]
In [ ]:
We could keep adding stopwords as we try to make these keywords better, but it's kind of like playing whack-a-mole.
An additional solution to The Problem: add a new term to our "representative-ness" measure that accounts for the overall rarity of each word.
$$\frac{n_w}{N}$$
where $n_w$ is the number of documents containing word $w$, and $N$ is the total number of documents.
But we want a potential keyword to have a lower score if it is common in the corpus and a higher score if it is rarer, so we flip it:
$$\frac{N}{n_w}$$
It's also common to take the log of this to reduce the amount of disparity between extremely common and extremely uncommon terms.
$$\log \frac{N}{n_w}$$
This is called IDF, or Inverse Document Frequency.
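To get a feel for the scale of these scores, here is a quick check with made-up numbers (assuming 1,000 documents in total, which is not our dataset's real count):
In [ ]:
# IDF for a word that appears in 10 of 1,000 documents vs. one that appears in 900 of them
print math.log(1000 / 10.0)   # rare word: about 4.6
print math.log(1000 / 900.0)  # very common word: about 0.1
Let's calculate it for all the words in our comment dataset!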
In [ ]:
N_documents = float(len(data))
word_document_counts = collections.Counter()  # word -> number of comments that contain it
word_idf = {}
In [ ]:
for row in data[1:]:
    blob = TextBlob(row[-1].lower())
    for word in set(blob.words):
        word_document_counts[word] += 1
In [ ]:
# calculate IDFs
for word, document_count in word_document_counts.iteritems():
    word_idf[word] = math.log(N_documents / document_count)
For each word $w$ in a given document $D$, we can compute the term frequency
$$\frac{D_w}{W_D}$$
where $D_w$ is the number of occurrences of word $w$ in document $D$ and $W_D$ is the total number of words in document $D$.
We then multiply that term frequency by the word's IDF that we just calculated to get TF-IDF scores; the highest-scoring words are the ones most likely to be good representatives of that document.
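For example, with made-up numbers: a word that shows up 3 times in a 100-word comment has a term frequency of $\frac{3}{100} = 0.03$; multiplying by an IDF of 4.6 gives a TF-IDF score of about 0.14, while the same count for a very common word with an IDF of 0.1 scores only about 0.003.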
In [ ]:
comment = data[80][-1]
blob = TextBlob(comment.lower())
num_words_in_comment = len(blob.words)
word_count = blob.word_counts
tf_scores = {}
for word, count in word_count.iteritems():
    if word not in stopwords and len(word) > 2:
        tf_scores[word] = count / float(num_words_in_comment)
In [ ]:
tf_idf = {}
for word, tf in tf_scores.iteritems():
    tf_idf[word] = tf * word_idf[word]
sorted(tf_idf.iteritems(), key=lambda k: k[1], reverse=True)[:5]
Note that TF-IDF can be tweaked in lots of other ways if you aren't getting good results.
It can also be done with "n-grams": phrases that are n words long, which lets you capture multi-word phrases like "gay rights" or "hillary clinton".
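TextBlob can pull n-grams out directly. Here is a quick look at the bigrams in one comment (just a sketch; to score them you'd give them the same TF-IDF treatment as single words):
In [ ]:
# the first ten two-word phrases ("bigrams") in one comment
TextBlob(data[80][-1].lower()).ngrams(n=2)[:10]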
In [ ]:
from nltk.stem.porter import PorterStemmer
In [ ]:
stemmer = PorterStemmer()
print stemmer.stem('political')
print stemmer.stem('politics')
print stemmer.stem('politician')
In [ ]:
from nltk.text import Text
tokens = TextBlob(data[80][-1]).tokens
text_object = Text(tokens)
text_object.concordance('Hilary')
In [ ]:
blob = TextBlob(data[41][-1])
blob
In [ ]:
blob.sentiment
In [ ]:
blob.sentences[1].sentiment